Comment: Reading the dataset

# Import the IMDB workbook with readxl and open the result in the data viewer.
library(readxl)
IMDB_data <- read_excel(path = "~/Desktop/Datasets/IMDB data.xlsm")
View(x = IMDB_data)

Comments: Statistics

# Preview the first rows of the imported data (the output below shows a
# 6 x 12 tibble).
head(IMDB_data)
## # A tibble: 6 x 12
##    Rank Title Genre Description Director Actors  Year `Runtime (Minut…
##   <dbl> <chr> <chr> <chr>       <chr>    <chr>  <dbl>            <dbl>
## 1     1 Guar… Acti… A group of… James G… Chris…  2014              121
## 2     2 Prom… Adve… Following … Ridley … Noomi…  2012              124
## 3     3 Split Horr… Three girl… M. Nigh… James…  2016              117
## 4     4 Sing  Anim… In a city … Christo… Matth…  2016              108
## 5     5 Suic… Acti… A secret g… David A… Will …  2016              123
## 6     6 The … Acti… European m… Yimou Z… Matt …  2016              103
## # ... with 4 more variables: Rating <dbl>, Votes <dbl>, `Revenue
## #   (Millions)` <dbl>, Metascore <dbl>
# Storage type of the object; the output below reports "list".
typeof(IMDB_data)
## [1] "list"
# Compact structure listing: class, dimensions, and each column's type with
# its first few values.
str(IMDB_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  12 variables:
##  $ Rank              : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title             : chr  "Guardians of the Galaxy" "Prometheus" "Split" "Sing" ...
##  $ Genre             : chr  "Action,Adventure,Sci-Fi" "Adventure,Mystery,Sci-Fi" "Horror,Thriller" "Animation,Comedy,Family" ...
##  $ Description       : chr  "A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control "| __truncated__ "Following clues to the origin of mankind, a team finds a structure on a distant moon, but they soon realize the"| __truncated__ "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before t"| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ ...
##  $ Director          : chr  "James Gunn" "Ridley Scott" "M. Night Shyamalan" "Christophe Lourdelet" ...
##  $ Actors            : chr  "Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Saldana" "Noomi Rapace, Logan Marshall-Green, Michael Fassbender, Charlize Theron" "James McAvoy, Anya Taylor-Joy, Haley Lu Richardson, Jessica Sula" "Matthew McConaughey,Reese Witherspoon, Seth MacFarlane, Scarlett Johansson" ...
##  $ Year              : num  2014 2012 2016 2016 2016 ...
##  $ Runtime (Minutes) : num  121 124 117 108 123 103 128 89 141 116 ...
##  $ Rating            : num  8.1 7 7.3 7.2 6.2 6.1 8.3 6.4 7.1 7 ...
##  $ Votes             : num  757074 485820 157606 60545 393727 ...
##  $ Revenue (Millions): num  333 126 138 270 325 ...
##  $ Metascore         : num  76 65 62 59 40 42 93 71 78 41 ...
# Sample variance of the vote counts and runtimes.
var(IMDB_data$Votes)
## [1] 35631337098
var(IMDB_data$`Runtime (Minutes)`)
## [1] 353.8503
# Revenue contains missing values (128 NA's in the summary output), and var()
# propagates NA by default, so the plain call cannot give a usable number.
var(IMDB_data$`Revenue (Millions)`)
## [1] NA
# Drop the missing revenues to obtain the variance of the observed values.
var(IMDB_data$`Revenue (Millions)`, na.rm = TRUE)
# Per-column summary of the whole dataset: quartiles, mean and extremes for
# numeric columns, length/class/mode for character columns, plus an NA count
# for columns with missing values (Revenue and Metascore in the output below).
summary(IMDB_data)
##       Rank           Title              Genre           Description       
##  Min.   :   1.0   Length:1000        Length:1000        Length:1000       
##  1st Qu.: 250.8   Class :character   Class :character   Class :character  
##  Median : 500.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 500.5                                                           
##  3rd Qu.: 750.2                                                           
##  Max.   :1000.0                                                           
##                                                                           
##    Director            Actors               Year      Runtime (Minutes)
##  Length:1000        Length:1000        Min.   :2006   Min.   : 66.0    
##  Class :character   Class :character   1st Qu.:2010   1st Qu.:100.0    
##  Mode  :character   Mode  :character   Median :2014   Median :111.0    
##                                        Mean   :2013   Mean   :113.2    
##                                        3rd Qu.:2016   3rd Qu.:123.0    
##                                        Max.   :2016   Max.   :191.0    
##                                                                        
##      Rating          Votes         Revenue (Millions)   Metascore     
##  Min.   :1.900   Min.   :     61   Min.   :  0.00     Min.   : 11.00  
##  1st Qu.:6.200   1st Qu.:  36309   1st Qu.: 13.27     1st Qu.: 47.00  
##  Median :6.800   Median : 110799   Median : 47.98     Median : 59.50  
##  Mean   :6.723   Mean   : 169808   Mean   : 82.96     Mean   : 58.99  
##  3rd Qu.:7.400   3rd Qu.: 239910   3rd Qu.:113.72     3rd Qu.: 72.00  
##  Max.   :9.000   Max.   :1791916   Max.   :936.63     Max.   :100.00  
##                                    NA's   :128        NA's   :64
# The same summaries, one numeric column at a time.
summary(IMDB_data$Year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2006    2010    2014    2013    2016    2016
summary(IMDB_data$`Runtime (Minutes)`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    66.0   100.0   111.0   113.2   123.0   191.0
summary(IMDB_data$Rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.900   6.200   6.800   6.723   7.400   9.000
summary(IMDB_data$Votes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      61   36309  110799  169808  239910 1791916
summary(IMDB_data$`Revenue (Millions)`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   13.27   47.98   82.96  113.72  936.63     128
summary(IMDB_data$Metascore)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   11.00   47.00   59.50   58.99   72.00  100.00      64
# Count missing cells: first across the whole table, then for the two
# columns that contain them.
sum(is.na(IMDB_data))
## [1] 192
sum(is.na(IMDB_data[["Revenue (Millions)"]]))
## [1] 128
sum(is.na(IMDB_data[["Metascore"]]))
## [1] 64

Comment: Making Categories

# Bin the numeric columns into labelled categories with cut().
# cut() builds right-closed intervals (lo, hi] by default, so a value equal to
# the first break is left uncategorised (NA) unless include.lowest = TRUE.

# Votes: the summary shows a range of 61 to 1,791,916, inside the breaks.
votes_categories <- cut(
  IMDB_data$Votes,
  breaks = c(60, 37000, 120000, 240000, 1800000),
  labels = c("Low_Votes", "Medium_Votes", "High_Votes", "Highest_Votes")
)

# Rating: the summary shows a range of 1.9 to 9.0, inside the breaks.
# NOTE(review): "Medium Ratings" uses a space where the other labels use an
# underscore; left unchanged in case other code matches on this exact level.
Rating_categories <- cut(
  IMDB_data$Rating,
  breaks = c(0, 4, 7, 10),
  labels = c("Low_Ratings", "Medium Ratings", "High_Ratings")
)

# Revenue: the summary reports a minimum of 0.00, which equals the first
# break. include.lowest = TRUE keeps a revenue of exactly 0 in "Low_Revenue"
# instead of silently turning it into NA. Missing revenues remain NA.
Revenue_categories <- cut(
  IMDB_data$`Revenue (Millions)`,
  breaks = c(0, 47, 113, 940),
  labels = c("Low_Revenue", "Medium_Revenue", "High_Revenue"),
  include.lowest = TRUE
)

# Metascore: the summary shows a range of 11 to 100, inside the breaks.
# Missing metascores remain NA.
Metascore_categories <- cut(
  IMDB_data$Metascore,
  breaks = c(10, 46, 60, 72, 100),
  labels = c(
    "Lowest_Metascore", "Low_Metascore", "Medium_Metascore", "High_Metascore"
  )
)

Comment: Binding into dataset

# Append the four category factors to the original columns, one per row.
IMDB_categorized_data <- cbind(
  IMDB_data,
  votes_categories,
  Rating_categories,
  Revenue_categories,
  Metascore_categories
)

Comments: Plotting

# Rating against Rank for movies in the "Low_Votes" bin, drawn in red.
plot(
  x = IMDB_categorized_data$Rank[votes_categories == "Low_Votes"],
  y = IMDB_categorized_data$Rating[votes_categories == "Low_Votes"],
  main = "Rank vs Ratings 1",
  xlab = "Rank",
  ylab = "Ratings",
  col = "red",
  las = 1
)

Comments: It can be seen that a majority of movies lie in the rating range of 5-8. On the other hand, there is a large variation in the rank of movies with approximately the same rating. Perhaps we will have to explore the plot on more variables to find the reason for this variation.

# Rating against Rank for movies in the "Medium_Votes" bin.
plot(
  x = IMDB_categorized_data$Rank[votes_categories == "Medium_Votes"],
  y = IMDB_categorized_data$Rating[votes_categories == "Medium_Votes"],
  main = "Rank vs Ratings 2",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: The plot is more compressed for medium_votes range as ratings on Y-axis vary from 6-8 for a majority of movies. While on the other hand the disparity of rank remains the same.

# Rating against Rank for movies in the "High_Votes" bin.
plot(
  x = IMDB_categorized_data$Rank[votes_categories == "High_Votes"],
  y = IMDB_categorized_data$Rating[votes_categories == "High_Votes"],
  main = "Rank vs Ratings 3",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: Like the plot for Medium_Votes, it can be seen that a majority of movies lie in the rating range of 6-8. On the other hand, there is a large variation in the rank of movies with approximately the same rating. Perhaps we will have to explore the plot on more variables to find the reason for this variation.

# Rating against Rank for movies in the "Highest_Votes" bin.
plot(
  x = IMDB_categorized_data$Rank[votes_categories == "Highest_Votes"],
  y = IMDB_categorized_data$Rating[votes_categories == "Highest_Votes"],
  main = "Rank vs Ratings 4",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: The plot is more compressed for highest_votes range as ratings on Y-axis vary from 6.5-8 for a majority of movies. Also it can be seen that there is a huge concentration of movies from rank 100-450.

# Rating against Rank for movies in the "Low_Revenue" bin.
plot(
  x = IMDB_categorized_data$Rank[Revenue_categories == "Low_Revenue"],
  y = IMDB_categorized_data$Rating[Revenue_categories == "Low_Revenue"],
  main = "Rank vs Ratings 5",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: It can be seen that for low_revenue, the ratings vary from 5.5-8 for a majority of movies. But there is a vast divergence in terms of rank.

# Rating against Rank for movies in the "Medium_Revenue" bin.
plot(
  x = IMDB_categorized_data$Rank[Revenue_categories == "Medium_Revenue"],
  y = IMDB_categorized_data$Rating[Revenue_categories == "Medium_Revenue"],
  main = "Rank vs Ratings 6",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: The ratings vary from 6-8 for a majority of movies, while the rank varies from 10-1000.

# Rating against Rank for movies in the "High_Revenue" bin.
plot(
  x = IMDB_categorized_data$Rank[Revenue_categories == "High_Revenue"],
  y = IMDB_categorized_data$Rating[Revenue_categories == "High_Revenue"],
  main = "Rank vs Ratings 7",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: It can be seen that ratings vary from 6-8 and also there is a huge concentration of points from 0-400 rank. Hence we can conclude that movies which generate a high revenue generally have a rating above 6 and rank below 400 with some exceptions.

# Rating against Rank for movies in the "Lowest_Metascore" bin.
plot(
  x = IMDB_categorized_data$Rank[Metascore_categories == "Lowest_Metascore"],
  y = IMDB_categorized_data$Rating[Metascore_categories == "Lowest_Metascore"],
  main = "Rank vs Ratings 8",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: It can be seen that for lowest_metascore categories ratings usually vary in the range of 5-7, while the rank is usually above 350. Hence we can conclude that metascore depends on rank as well as ratings for lowest_metascore categories.

# Rating against Rank for movies in the "Low_Metascore" bin.
plot(
  x = IMDB_categorized_data$Rank[Metascore_categories == "Low_Metascore"],
  y = IMDB_categorized_data$Rating[Metascore_categories == "Low_Metascore"],
  main = "Rank vs Ratings 9",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: It can be seen that for low_metascore categories ratings usually vary in the range of 6-8, while the rank is varying from 10-1000.

# Rating against Rank for movies in the "Medium_Metascore" bin.
plot(
  x = IMDB_categorized_data$Rank[Metascore_categories == "Medium_Metascore"],
  y = IMDB_categorized_data$Rating[Metascore_categories == "Medium_Metascore"],
  main = "Rank vs Ratings 10",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: It can be seen that for medium_metascore categories ratings usually vary in the range of 6.5-8, while the rank has a large variance.

# Rating against Rank for movies in the "High_Metascore" bin.
plot(
  x = IMDB_categorized_data$Rank[Metascore_categories == "High_Metascore"],
  y = IMDB_categorized_data$Rating[Metascore_categories == "High_Metascore"],
  main = "Rank vs Ratings 11",
  xlab = "Rank",
  ylab = "Ratings",
  las = 1
)

Comments: It can be seen that for high_metascore categories ratings usually vary in the range of 7-8, while rank has a large variance.

# Runtime against Rank for every movie in the dataset.
plot(
  x = IMDB_data$Rank,
  y = IMDB_data$`Runtime (Minutes)`,
  main = "Rank vs Runtime",
  xlab = "Rank",
  ylab = "Runtime",
  las = 1
)

Comments: It can be concluded that a majority of movies have a runtime between 90-130 minutes.

# Natural log of the vote count against Rank for every movie.
# The y axis shows log(Votes), not raw counts, so the axis label says so.
plot(
  x = IMDB_data$Rank,
  y = log(IMDB_data$Votes),
  main = "Rank vs Votes",
  xlab = "Rank",
  ylab = "log(Votes)",
  las = 1
)

Comments: It can be concluded that log(IMDB_data$Votes) varies between 10-14 for a majority of movies. That is, movies of every rank fall within the same range of votes, so the number of votes cannot be a governing factor for rank.

# Metascore against Revenue for movies in the "Low_Ratings" bin.
plot(
  x = IMDB_categorized_data$`Revenue (Millions)`[Rating_categories == "Low_Ratings"],
  y = IMDB_categorized_data$Metascore[Rating_categories == "Low_Ratings"],
  main = "Revenue vs Metascore 1",
  xlab = "Revenue",
  ylab = "Metascore",
  las = 1
)

Comments: It can be seen that in the low_ratings category a majority of movies generate a revenue below 20 million dollars. Moreover, their metascore is also below 55.

# Metascore against Revenue for movies in the "High_Ratings" bin.
plot(
  x = IMDB_categorized_data$`Revenue (Millions)`[Rating_categories == "High_Ratings"],
  y = IMDB_categorized_data$Metascore[Rating_categories == "High_Ratings"],
  main = "Revenue vs Metascore 2",
  xlab = "Revenue",
  ylab = "Metascore",
  las = 1
)

Comments: It can be seen that for the high_ratings category, movies have a metascore varying from 30 to 90, but a majority of movies have generated a revenue below 150 million dollars. So a high or low metascore in the high_ratings category does not necessarily imply that a movie will earn above 150 million dollars.

# Revenue against Rating for movies in the "Low_Metascore" bin.
plot(
  x = IMDB_categorized_data$Rating[Metascore_categories == "Low_Metascore"],
  y = IMDB_categorized_data$`Revenue (Millions)`[Metascore_categories == "Low_Metascore"],
  main = "Rating vs Revenue",
  xlab = "Rating",
  ylab = "Revenue",
  las = 1
)

Comments: It can be seen that for a low_metascore category, ratings are usually in between 5.5-7.5, while revenue is below 100 million dollars. It can be hence concluded that revenue generation can definitely affect metascore for a movie.

# Metascore against Rating for movies in the "Low_Revenue" bin.
# The y-axis label previously read 'Metacore'; it now spells the column name
# correctly.
plot(
  x = IMDB_categorized_data$Rating[Revenue_categories == "Low_Revenue"],
  y = IMDB_categorized_data$Metascore[Revenue_categories == "Low_Revenue"],
  main = "Rating vs Metascore",
  xlab = "Rating",
  ylab = "Metascore",
  las = 1
)

Comments: It can be said that as the ratings increase the metascore also increases for a majority of low_revenue category movies.

# Runtime against Rating for every movie in the categorized dataset.
plot(
  x = IMDB_categorized_data$Rating,
  y = IMDB_categorized_data$`Runtime (Minutes)`,
  main = "Rating vs Runtime",
  xlab = "Rating",
  ylab = "Runtime",
  las = 1
)

Comments: It can be seen that as runtime increases, ratings also increase within a certain range for a majority of movies.

Comments: ggplots

library(ggplot2)

# Histogram of ratings with bins 0.1 wide.
p1 <- ggplot(IMDB_data, mapping = aes(x = Rating))
p1 +
  geom_histogram(binwidth = 0.1) +
  labs(title = "Histogram for Rating")

# Histogram of vote counts with bins 50000 wide.
p2 <- ggplot(IMDB_data, mapping = aes(x = Votes))
p2 +
  geom_histogram(binwidth = 50000) +
  labs(title = "Histogram for Votes")

# Histogram of metascores with bins 10 wide; rows with a missing Metascore
# are dropped, which produces the warning recorded below.
p3 <- ggplot(IMDB_data, mapping = aes(x = Metascore))
p3 +
  geom_histogram(binwidth = 10) +
  labs(title = "Histogram for Metascore")
## Warning: Removed 64 rows containing non-finite values (stat_bin).

Comments: qqplots

# Normal Q-Q plot of the ratings of "Low_Revenue" movies, with a red
# reference line added by qqline().
qqnorm(
  y = IMDB_categorized_data$Rating[Revenue_categories == "Low_Revenue"]
)
qqline(
  y = IMDB_categorized_data$Rating[Revenue_categories == "Low_Revenue"],
  col = "red"
)

Comments: It can be seen that the ratings of low_revenue category movies follow a normal distribution for a majority of data points.

# Two-sample Q-Q plot: quantiles of Rating against quantiles of Metascore,
# both restricted to "Low_Revenue" movies, drawn in blue.
qqplot(
  x = IMDB_categorized_data$Rating[Revenue_categories == "Low_Revenue"],
  y = IMDB_categorized_data$Metascore[Revenue_categories == "Low_Revenue"],
  xlab = "Ratings",
  ylab = "Metascore",
  col = "blue",
  las = 1
)

Comments: It can be seen that the qqplot for Ratings vs Metascore shows that the data points have a normal distribution.